
| STA 141B Final Project |
| Analysis of Happiness |
Shang Jiang : 915497630
Jiahui Li :915544392
Songkai Xiao :915576357
import pandas as pd
import numpy as np
import math
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotnine as gg
from scipy import stats, integrate
from mpl_toolkits.mplot3d import Axes3D
sns.set(color_codes=True)
%matplotlib inline
matplotlib.style.use('ggplot')
matplotlib.rcParams['figure.figsize'] = (12,8)
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
from plotly import __version__
import cufflinks as cf
from sklearn.preprocessing import scale
import warnings
warnings.filterwarnings('ignore')
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
hp15 = pd.read_csv("./world-happiness-report/2015.csv")
hp16 = pd.read_csv("./world-happiness-report/2016.csv")
hp17 = pd.read_csv("./world-happiness-report/2017.csv")
raw = pd.read_csv("raw_data.csv")
hp15 = hp15.set_index('Country')
hp16 = hp16.set_index('Country')
hp17 = hp17.set_index('Country')
hp17 = hp17.rename(columns = {'Whisker.high':'Upper Confidence Interval','Whisker.low':'Lower Confidence Interval',
'Economy..GDP.per.Capita.':'Economy (GDP per Capita)',
'Health..Life.Expectancy.':'Health (Life Expectancy)',
'Trust..Government.Corruption.':'Trust (Government Corruption)',
'Dystopia.Residual':'Dystopia Residual'})
hp17 = hp17.join(hp16.Region)
raw['Happiness Score'] = raw['Life Ladder']
hpall = raw['Happiness Score'].dropna()
gdpall = raw['GDP per capita'].dropna()
socialall = raw['Social support'].dropna()
freedomall = raw ['Freedom to make life choices'].dropna()
healthyall = raw['Healthy life expectancy at birth'].dropna()
corruptionall = raw['Perceptions of corruption'].dropna()
generosityall = raw['Generosity'].dropna()
sns.set(rc={'figure.figsize':(20,20)})
plt.subplot(421)
sns.distplot(hpall)
plt.subplot(422)
sns.distplot(gdpall)
plt.subplot(423)
sns.distplot(socialall)
plt.subplot(424)
sns.distplot(freedomall)
plt.subplot(425)
sns.distplot(healthyall)
plt.subplot(426)
sns.distplot(corruptionall)
plt.subplot(427)
sns.distplot(generosityall)
Since GPD is severely right-skewed, which will effect our analysis, so we make transformation, taking log, on it.
raw['Log GDP per capita']= [math.log(x) for x in raw['GDP per capita']]
gdpall = raw['Log GDP per capita'].dropna()
sns.set(rc={'figure.figsize':(35,35)})
plt.subplot(421)
sns.set(font_scale=3)
ax = sns.violinplot(x="year", y="Log GDP per capita", data=raw)
ax.set(title = "Log GDP per capita By Year", xlabel = "", ylabel = "")
ax.set_xticklabels(ax.get_xticklabels(), rotation = 0)
ax.tick_params(labelsize=30)
plt.subplot(422)
sns.set(font_scale=3)
ax = sns.violinplot(x="year", y="Happiness Score", data=raw)
ax.set(title = "Happiness Score By Year", xlabel = "", ylabel = "")
ax.set_xticklabels(ax.get_xticklabels(), rotation = 0)
ax.tick_params(labelsize=30)
plt.subplot(423)
sns.set(font_scale=3)
ax = sns.violinplot(x="year", y="Social support", data=raw)
ax.set(title = "Social support By Year", xlabel = "", ylabel = "")
ax.set_xticklabels(ax.get_xticklabels(), rotation = 0)
ax.tick_params(labelsize=30)
plt.subplot(424)
sns.set(font_scale=3)
ax = sns.violinplot(x="year", y="Freedom to make life choices", data=raw)
ax.set(title = "Freedom to make life choices By Year", xlabel = "", ylabel = "")
ax.set_xticklabels(ax.get_xticklabels(), rotation = 0)
ax.tick_params(labelsize=30)
plt.subplot(425)
sns.set(font_scale=3)
ax = sns.violinplot(x="year", y="Healthy life expectancy at birth", data=raw)
ax.set(title = "Healthy life expectancy at birth By Year", xlabel = "", ylabel = "")
ax.set_xticklabels(ax.get_xticklabels(), rotation = 0)
ax.tick_params(labelsize=30)
plt.subplot(426)
sns.set(font_scale=3)
ax = sns.violinplot(x="year", y="Perceptions of corruption", data=raw)
ax.set(title = "Perceptions of corruption By Year", xlabel = "", ylabel = "")
ax.set_xticklabels(ax.get_xticklabels(), rotation = 0)
ax.tick_params(labelsize=30)
plt.subplot(427)
sns.set(font_scale=3)
ax = sns.violinplot(x="year", y="Generosity", data=raw)
ax.set(title = "Generosity By Year", xlabel = "", ylabel = "")
ax.set_xticklabels(ax.get_xticklabels(), rotation = 0)
ax.tick_params(labelsize=30)
From the plot, we found that the data of 2005 is abnormal, the distribution of which is much different from those of other years, so we take a careful look on the data of year 2015, to check if they should remain in the data we use to the further analysis.
pd.DataFrame(raw.year.value_counts()).T
The data of 2005 only record 27 countries, which is much less than the othe years, so we guess the abnormal distribution is because the size of data is smaller, not because of other experimental or recording errors.
We plot the distribution of every variable of 2005 with that of whole data to have a further check.
data05 = raw[raw.year==2005]
sns.set(rc={'figure.figsize':(20,20)})
plt.subplot(421)
ax = sns.distplot(hpall)
ax = sns.distplot(data05['Happiness Score'].dropna())
plt.subplot(422)
ax = sns.distplot(gdpall)
ax = sns.distplot(data05['Log GDP per capita'].dropna())
plt.subplot(423)
ax = sns.distplot(socialall)
ax = sns.distplot(data05['Social support'])
plt.subplot(424)
ax = sns.distplot(freedomall)
ax = sns.distplot(data05['Freedom to make life choices'].dropna())
plt.subplot(425)
ax = sns.distplot(healthyall)
ax = sns.distplot(data05['Healthy life expectancy at birth'].dropna())
plt.subplot(426)
ax = sns.distplot(corruptionall)
ax = sns.distplot(data05['Perceptions of corruption'].dropna())
plt.subplot(427)
ax = sns.distplot(generosityall)
ax = sns.distplot(data05['Generosity'].dropna())
From the plot, we can find the distribution of data of 2005 year is similar to the higher part of the distribution of whole data. When we take a closer look of the data, we find the countries included in 2005 data are mainly developed country, maybe because the report only collect the happiness score of some developed countries in the first years.
raw.rename(columns={'country':'Country'},inplace =True)
raw = raw.set_index('Country')
raw = raw.join(hp15.Region)
sns.set(rc={'figure.figsize':(30,30)})
plt.subplot(321)
sns.set(font_scale=3)
ax = sns.boxplot(x = 'Happiness Score', y = 'Region',data=raw) #draw the barplot
ax.set(title = "Happiness Score By Region", xlabel = "", ylabel = "")
ax.set_xticklabels(ax.get_xticklabels(), rotation = 90)
ax.tick_params(labelsize=16)
plt.subplot(322)
sns.set(font_scale=3)
ax = sns.boxplot(x = 'Log GDP per capita', y = 'Region',data=raw) #draw the barplot
ax.set(title = "Log GDP per capita By Region", xlabel = "", ylabel = "")
ax.set_xticklabels(ax.get_xticklabels(), rotation = 90)
ax.tick_params(labelsize=16)
plt.subplot(323)
sns.set(font_scale=3)
ax = sns.boxplot(x = 'Social support', y = 'Region',data=raw) #draw the barplot
ax.set(title = "Social support By Region", xlabel = "", ylabel = "")
ax.set_xticklabels(ax.get_xticklabels(), rotation = 90)
ax.tick_params(labelsize=16)
plt.subplot(324)
sns.set(font_scale=3)
ax = sns.boxplot(x = 'Healthy life expectancy at birth', y = 'Region',data=raw) #draw the barplot
ax.set(title = "Healthy life expectancy at birth By Region", xlabel = "", ylabel = "")
ax.set_xticklabels(ax.get_xticklabels(), rotation = 90)
ax.tick_params(labelsize=16)
plt.subplot(325)
sns.set(font_scale=3)
ax = sns.boxplot(x = 'Freedom to make life choices', y = 'Region',data=raw) #draw the barplot
ax.set(title = "Freedom By Region", xlabel = "", ylabel = "")
ax.set_xticklabels(ax.get_xticklabels(), rotation = 90)
ax.tick_params(labelsize=16)
plt.subplot(326)
sns.set(font_scale=3)
ax = sns.boxplot(x = 'Perceptions of corruption', y = 'Region',data=raw) #draw the barplot
ax.set(title = "Corruption By Region", xlabel = "", ylabel = "")
ax.set_xticklabels(ax.get_xticklabels(), rotation = 90)
ax.tick_params(labelsize=16)
def drawworld(df, name, year):
data = dict(type = 'choropleth',
locations = df.index,
locationmode = 'country names',
colorscale = [[0,"rgb(153, 51, 51)"],[0.5,"rgb(240, 240, 240)"],[1,"rgb(255, 153, 204)"]],
z = df[name],
text = df.index,
colorbar = {'title':str(name)})
layout = dict(title = 'World '+str(name) + ' in ' +str(year),
geo = dict(showframe = False, projection = {'type': 'Mercator'}))
choromap3 = go.Figure(data = [data], layout=layout)
iplot(choromap3)
data16 = raw[raw.year ==2016]
data16 =data16.set_index('country')
drawworld(hp16,'Happiness Score',2016)
drawworld(data16,'Log GDP per capita',2016)
drawworld(data16,'Social support',2016)
From the world distribution, we can find that the distributions of Happiness Score, GDP, Social Support, Healthy life expectancy are similar, so obviously, they have not weak correaltion between happiness score with GDP, Social Support and Healthy life expectancy.
data16_new=data16.dropna()
data16_new_hp = data16_new['Happiness Score']
data = pd.DataFrame(scale(data16_new),index=data16_new.index, columns= data16_new.columns)
data['Happiness Score'] = data16_new_hp
data = data.sort_values(['Happiness Score'],ascending=True)
cf.go_offline()
data[['Happiness Score','Log GDP per capita','Social support','Healthy life expectancy at birth']].iplot(kind='spread')
cf.go_offline()
data[['Happiness Score','Freedom to make life choices','Generosity']].iplot(kind='spread')
cf.go_offline()
data[['Happiness Score','Perceptions of corruption']].iplot(kind='spread')
From the regreesion result, it appears these variables we choose(i.e Log GDP per capita, Social support, Healthy life expectancy at birth, Freedom to make life choices, Perception of corruption) contribute greatly to the happiness score.
The following question comes naturally:
(i) Can these countries be separated into different groups? Are these groups distinct from each other?
(ii) If yes, what's the characteristics of these groups?
Now, the method K-Means Clustering plays an important role to answer these questions.
Since some variables of some countries in our data are missing for some reasons (not recorded, not covered, etc.). But we still want to fully utilize these imcomplete data, here we get the mean of each variable of each country given the data we have; and take the mean as the primary index of these variables.
data = pd.read_csv('final_model_data.csv')
data.set_index('country', inplace = True)
data.dropna(inplace = True)
country_index = data[['Log GDP per capita', 'Social support','Healthy life expectancy at birth', 'Freedom to make life choices',
'Perceptions of corruption']]
country_index_year_average = country_index.groupby('country').agg(np.mean)
X = country_index_year_average
X.shape
data.columns
X.head()
Because the K-means Clustering method is scale-sensitive (the algorithm it use calculates the "distances" between variables and thus the different scales are influential!), we should standardize the data first.
std_data = pd.DataFrame(scale(X),index=X.index, columns= X.columns)
std_data.head()
Now, we are ready to cluster!
Based on the visualization and regression result, we decide to choose three clusters. and 'k-means++' to select initial cluster centers for k-mean clustering in a smart way to speed up convergence.
kmeans = KMeans(init='k-means++', n_clusters=3, n_init=10, random_state=669)
kmeans.fit(std_data)
std_data['cluster'] = kmeans.fit_predict(std_data)
std_data.head()
std_data.tail()
std_data.cluster.value_counts().plot.bar(fontsize = 12, title = 'Cluster', figsize = (10,10))
Above we get the clustering results. Most countries belong to Cluster1, and fewest countries belong to Cluster0.
But it's still hard to see the results clearly since they have too many variables(columns). Fortunately, we can take advantage of PCA to visualize the clustering results.
pca = PCA(n_components=3)
pc_X = pca.fit_transform(std_data) #PCA
print(pc_X.shape)
print(pca.explained_variance_ratio_)
print(pca.components_)
PC_X_df = pd.DataFrame(pc_X,columns= ['PCA1','PCA2','PCA3'],index=X.index)
PC_X_df.head()
The first two principal components explain about 80% variability of the data; and the first three explain about 90%. The first three components appear good enough.
PC_cluster = pd.concat([PC_X_df,std_data['cluster']],axis = 1)
PC_cluster.head()
PC_cluster.PCA1['Norway'],PC_cluster.PCA2['Norway'] # the cordinate of Norway
x, y = PC_cluster.PCA1, PC_cluster.PCA2
ind1,ind2,ind3 = PC_cluster.index[PC_cluster.cluster==0],PC_cluster.index[PC_cluster.cluster==1],PC_cluster.index[PC_cluster.cluster==2]
fig = plt.figure(figsize = (12,12))
ax = plt.subplot(111)
ax.scatter(x[ind1], y[ind1], c='black',s = 100,label='cluster 0')
ax.scatter(x[ind2], y[ind2], c='red',s = 100,label='cluster 1')
ax.scatter(x[ind3], y[ind3], c='b',s = 100,label='cluster 2')
plt.text(PC_cluster.PCA1['Norway'],PC_cluster.PCA2['Norway'],'Norway',fontsize = 20)
plt.text(PC_cluster.PCA1['Central African Republic'],PC_cluster.PCA2['Central African Republic'],'Central African Republic',fontsize = 20)
plt.text(PC_cluster.PCA1['South Korea'],PC_cluster.PCA2['South Korea'],'South Korea',fontsize = 20)
ax.set_ylabel('PCA2')
ax.set_xlabel('PCA1')
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels)
x, y, z = PC_cluster.PCA1, PC_cluster.PCA2, PC_cluster.PCA3
ind1,ind2,ind3 = PC_cluster.index[PC_cluster.cluster==0],PC_cluster.index[PC_cluster.cluster==1],PC_cluster.index[PC_cluster.cluster==2]
fig = plt.figure(figsize = (12,12))
ax = plt.subplot(111, projection='3d')
ax.scatter(x[ind1], y[ind1], z[ind1], c='black',s = 100,label='cluster 0')
ax.scatter(x[ind2], y[ind2], z[ind2], c='r',s = 100,label='cluster 1')
ax.scatter(x[ind3], y[ind3], z[ind3], c='b',s = 100,label='cluster 2')
ax.set_zlabel('PCA3')
ax.set_ylabel('PCA2')
ax.set_xlabel('PCA1')
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels)
plt.show()
From the 2D Scatter Plot(here I text three representative country in each group: 'Norway', '), we can see the clustering performs well: The three groups are separated clearly. And it shows that Cluster0, Cluster1, Cluster2 are corresponding to the "least happy", "moderately happy", "happiest" country groups, respectively. From the 3D Scatter Plot, we can see that the clustering performance is also great.
Now, let's look at each group closely with the 2017 happiness data.
And then we gain insights into the data using Pivot Table to compare clusters.
hp17_rank = hp17[['Happiness.Rank','Happiness.Score']]
myPC = PC_cluster.join(hp17_rank).dropna()
myPC[myPC.cluster == 2].sort_values('Happiness.Rank').head()
myPC[myPC.cluster == 1].sort_values('Happiness.Rank').head()
myPC[myPC.cluster == 0].sort_values('Happiness.Rank',ascending = False).head()
The three tables give the PCAs of the three clusters and their happiness ranks and scores.
final_df = country_index_year_average.join(myPC)
final_df.head()
pivot_table_cluster = final_df.pivot_table(index='cluster', values=['Happiness.Rank','Happiness.Score','Log GDP per capita', 'Social support',
'Healthy life expectancy at birth', 'Freedom to make life choices',
'Perceptions of corruption', 'PCA1', 'PCA2','PCA3'],aggfunc = np.mean)
pivot_table_cluster
cluter_index_summary = pivot_table_cluster.sort_values('Happiness.Rank')[['Happiness.Rank','Happiness.Score','Log GDP per capita', 'Social support',
'Healthy life expectancy at birth', 'Freedom to make life choices',
'Perceptions of corruption', 'PCA1', 'PCA2','PCA3']]
cluter_index_summary
The pivot table give a summary for us to compare the clusters.
It appears clearly that: The happiest group: have the higgest Log GDP per capital, Healthy life expectancy, Freedom to make life choices and the trust of government(the smaller 'Perceptions of corruption' is, the more confidence they have in the government).
The least happy group: have the lowest Log GDP per capital, Healthy life expectancy, Freedom to make life choices and the trust of government(the smaller 'Perceptions of corruption' is, the more confidence they have in the government).
Thus, we can draw a conclusion: happiness score is highly and positively correlated with the Log GDP per capital, Healthy life expectancy, Freedom to make life choices and the trust of government.